/*
Purpose: Clean and normalize data from NAEP data explorer
*/

clear all

import excel "data/raw/state_scale_score.xls", cellrange("A9:E804") first clear
ren Year year
ren Jurisdiction state_name
drop Allstudents
ren Averagescalescore scale_score
ren Standarddeviation sd_scale
tempfile score_sd
save `score_sd', replace

import excel "data/raw/state_percentiles.xls", cellrange("A9:I804") first clear
ren Year year
ren Jurisdiction state_name
drop Allstudents Averagescalescore
ren thpercentile pctile_10
ren F pctile_25
ren G pctile_50
ren H pctile_75
ren I pctile_90

merge 1:1 year state_name using `score_sd'
assert _merge==3
drop _merge

drop if year=="2000¹"|year=="1996" //these are years with two sets of scores; dropping 1996 because no state scores and 2000¹ because the "1" means no accommodations were permitted (i.e., it is less similar to the current version of the test)
replace year = substr(year,1,4)
destring year, replace

preserve
keep if state_name=="National public"&year==1992
global mean92 = real(scale_score[1])
global sd92 = real(sd_scale[1])
restore

foreach var of varlist pctile_10-scale_score {
	drop if `var'=="—"|`var'=="‡"
	destring `var', replace
	gen `var'_norm = (`var' - $mean92) / $sd92
	drop `var'
}
drop sd_scale

statastates, name(state_name)
keep if _merge==3|state_name=="NATIONAL PUBLIC"
drop _merge

replace state_fips = 0 if state_name=="NATIONAL PUBLIC"

save "data/clean/math8_normed", replace
